1. Data import and manipulation for county level death and population

data source:

https://usafacts.org/visualizations/coronavirus-covid-19-spread-map/state/new-york

# df1 = 
# read.csv(file = "https://static.usafacts.org/public/data/covid-19/covid_deaths_usafacts.csv?_ga=2.213522960.567718552.1605752195-1031006592.1605647663")


# population
pop_df = 
  read.csv(file = "./covid_county_population_usafacts.csv") %>% 
  janitor::clean_names() %>% 
  filter(state == "NY") %>% 
  select(-state)


# deaths
death_df = 
  read.csv(file = "./covid_deaths_usafacts.csv") %>% 
  janitor::clean_names() %>% 
  filter(state == "NY") %>% 
  pivot_longer(
    cols = 4:305,
    names_to = "test_date",
    values_to = "cummulative_death"
  ) %>% 
  mutate(
    test_date = str_remove(test_date, pattern = "x"),
    test_date = str_replace(test_date, pattern = "_", "/"),
    test_date = str_replace(test_date, pattern = "_", "/"),
    test_date = str_replace(test_date, pattern = "20$","2020")
  ) %>% 
  select(-state)

# join dfs
death_pop = 
  left_join(pop_df, death_df, by = "county_fips") %>% 
  select(county_fips, county_name.x, population, test_date, cummulative_death) %>% 
  rename("county" = "county_name.x") %>% 
  group_by(test_date, county) %>% 
  relocate(test_date, county_fips, county) %>% 
  mutate(
    county = str_remove(county, pattern = ".County")
  ) %>% 
  select(-county_fips)

2. Data merging with new_york_statewide_covid_19_testing.csv data

data source:

ny_statewide_data = 
  read.csv("./New_York_State_Statewide_COVID-19_Testing.csv") %>% 
  janitor::clean_names() 

# join death_pop data into test data
ny_state_test_pop_mor = 
  left_join(ny_statewide_data, death_pop, by = c("test_date","county")) %>% 
  mutate(
    mortality_rate = cummulative_death / cumulative_number_of_positives, 
    test_date = as.Date(test_date, "%m/%d/%y"),
    month = months(test_date),
    year = format(test_date, format = "%y"),
  ) 

# visualize daily fatality rate
daily_fatal_plot1 = 
  ny_state_test_pop_mor %>% 
  ggplot(aes(test_date, mortality_rate, group = county)) +
  geom_line(stat = "identity", aes(color = county)) + 
  theme_light() +
  scale_x_date(name = "date", date_breaks = "1 month")

daily_fatal_plot1 =
  daily_fatal_plot1 %>% 
  ggplotly()
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
daily_fatal_plot1

3. Data formating: aggregte daily data to monthly data

# aggregate cummulative variables by taking maximum() for monthly data
ny_month_df1 = 
  aggregate(
    cbind(cumulative_number_of_positives, 
          cummulative_death,
          cumulative_number_of_tests_performed) 
    ~ month + county, 
    ny_state_test_pop_mor, 
    max) %>% 
    mutate(
      month_fatal = cummulative_death / cumulative_number_of_positives
    )

# aggregate incremental variables by taking sum() for montly data
ny_month_df2 = 
  aggregate(
    cbind(new_positives, 
          total_number_of_tests_performed,
          population)
    ~month + county,
    ny_state_test_pop_mor,
    sum) 

# combine 2 dfs, complete monthly data for fatality rate
ny_state_monthly_testpopmor = 
  left_join(ny_month_df1, ny_month_df2, by = c("month", "county"))

# manipulation for variables

# visualization monthly fatality rate in each state, by each month
monthly_fatal_plot1 = 
  ny_state_monthly_testpopmor %>% 
  ggplot(aes(month, month_fatal, group = county)) +
  geom_line(aes(color = county))+
  theme_light() 

monthly_fatal_plot1 =
  monthly_fatal_plot1 %>% 
  ggplotly()

monthly_fatal_plot1

save as csv

write_csv(ny_state_test_pop_mor, path = "./ny_state_test_pop_mor.csv")